The zip code was incorectly isolated from the address
Removed them ::: {.cell layout-align=“center”}
Code
#remove the rows with nan in cityproperties_filtered <- df[!is.na(df$City),]
:::
4.3 Tax data cleaning
Code
#excel file modified for fribourg and liestal # read csvimpots <-read.csv(file.path(here(),"data/statistik-steuerfuesse-np-1995-2023-fr.csv"), sep =",", header =TRUE, stringsAsFactors =FALSE)# Remove last 13 rowsimpots <-head(impots, -14)# Rename columnscolnames(impots) <-c("Chefs-lieux cantonaux", "Impôt cantonal", "Impôt communal", "Impôt paroissial Evang. / réform", "Impôt paroissial Kath. / cath.rom")# Set row names to the content of the first columnrownames(impots) <- impots$`Chefs-lieux cantonaux`# Remove the first column (which is now row names)impots1 <- impots[, -1, drop =FALSE]# Set the row names for impots1 the same as impotsrownames(impots1) <-rownames(impots)impots <- impots1rm(impots1)# Write data to CSV based on here()write.csv(impots, file.path(here(),"data/impots.csv"), row.names =TRUE)
4.3.1 Merging the two datasets
Code
# Create a copy of properties_filteredproperties_merged <- properties_filtered# Loop through unique canton names in properties_filteredfor (canton_name inunique(properties_filtered$canton)) {# Check if canton_name exists as a row name in impotsif (canton_name %in%rownames(impots)) {# Get the corresponding row from impots impots_row <- impots[canton_name, ]# Add columns to properties_merged with impots contentfor (col_name innames(impots)) { properties_merged[properties_merged$canton == canton_name, paste0(col_name, "_impots")] <- impots_row[[col_name]] }# Remove impots_row from memoryrm(impots_row) }}# Write data to CSV based on here()write.csv(properties_merged, file.path(here(),"data/properties_merged.csv"), row.names =TRUE)
##Dataset used for the rest of the analysis ::: {.cell layout-align=“center”}
Code
properties_filtered <- properties_merged
:::
4.4 Cleaning of commune data
Replaces NAs in both Taux de couverture social and Political (Conseil National Datas) For Taux de couverture Social: NAs were due to reason “Q” = “Not indicated to protect confidentiality” We replaced the NAs by the average taux de couverture in Switzerland in 2019, which was 3.2%
For Political data: NAs were due to reason “M” = “Not indicated because data was not important or applicable” Therefore, we replaced the NAs by 0
Code
# il faudra changer le path# commune_prep <- read.csv(file.path(here(),"data/commune_data.csv"), sep = ";", header = TRUE, stringsAsFactors = FALSE)# # # We keep only 2019 to have some reference? (2020 is apparently not really complete)# commune_2019 <- subset(commune_prep, PERIOD_REF == "2019") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE", "STATUS"))# # # delete les lignes ou Status = Q ou M (pas de valeur) et ensuite on enlève la colonne# commune_2019 <- subset(commune_2019, STATUS == "A") %>%# select(c("REGION", "CODE_REGION", "INDICATORS", "VALUE"))# # # on enlève les lignes qui sont des aggrégats# commune_2019 <- subset(commune_2019, REGION != "Schweiz")# # commune_2019 <- commune_2019 %>%# pivot_wider(names_from = INDICATORS, values_from = VALUE)# # # Rename columns using the provided map# df_commune <- commune_2019 %>%# rename(`Population - Habitants` = Ind_01_01,# `Population - Densité de la population` = Ind_01_03,# `Population - Etrangers` = Ind_01_08,# `Population - Part du groupe d'âge 0-19 ans` = Ind_01_04,# `Population - Part du groupe d'âge 20-64 ans` = Ind_01_05,# `Population - Part du groupe d'âge 65+ ans` = Ind_01_06,# `Population - Taux brut de nuptialité` = Ind_01_09,# `Population - Taux brut de divortialité` = Ind_01_10,# `Population - Taux brut de natalité` = Ind_01_11,# `Population - Taux brut de mortalité` = Ind_01_12,# `Population - Ménages privés` = Ind_01_13,# `Population - Taille moyenne des ménages` = Ind_01_14,# `Sécurité sociale - Taux d'aide sociale` = Ind_11_01,# `Conseil national - PLR` = Ind_14_01,# `Conseil national - PDC` = Ind_14_02,# `Conseil national - PS` = Ind_14_03,# `Conseil national - UDC` = Ind_14_04,# `Conseil national - PEV/PCS` = Ind_14_05,# `Conseil national - PVL` = Ind_14_06,# `Conseil national - PBD` = Ind_14_07,# `Conseil national - PST/Sol.` = Ind_14_08,# `Conseil national - PES` = Ind_14_09,# `Conseil national - Petits partis de droite` = Ind_14_10)# # # If no one voted for a party, set as NA -> replacing it with 0 instead# df_commune <- df_commune %>%# mutate_at(vars(starts_with("Conseil national")), ~replace_na(., 0))# # # # Removing NAs from Taux de couverture sociale column# # Setting the mean as the mean for Switzerland in 2019 (3.2%)# mean_taux_aide_social <- 3.2# # # Replace NA values with the mean# df_commune <- df_commune %>%# mutate(`Sécurité sociale - Taux d'aide sociale` = if_else(is.na(`Sécurité sociale - Taux d'aide sociale`), mean_taux_aide_social, `Sécurité sociale - Taux d'aide sociale`))#
histogram_price <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="red") +labs(title ="Distribution of Prices",x ="Price",y ="Frequency") +theme_minimal()# Convert ggplot object to plotly objectinteractive_histogram_price <-ggplotly(histogram_price)# Display the interactive histograminteractive_histogram_price
5.3 Histogram of prices for each property type
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create the ggplot objecthistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ property_type, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Property Type",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram <-ggplotly(histogram)# Display the interactive plotinteractive_histogram
5.4 Histogram of prices for each year category
note : only price between 0 and 500000 so some outliers aren’t here
Code
# Create a histogram of prices for each year categoryhistogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ year_category, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Year Category",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly objectinteractive_histogram_year <-ggplotly(histogram)# Display the interactive plotinteractive_histogram_year
5.5 Histogram of prices for each canton
note : only price between 0 and 500000 so some outliers aren’t here
Code
histogram <-ggplot(properties_filtered, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ canton, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Canton",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram) %>%layout(height =1000) # Adjust the height as needed# Display the interactive plotinteractive_histogram
5.6 Histogram of prices for each number of rooms
note : only price between 0 and 500000 so some outliers aren’t here
and the graph below only show apartments with less than 10 rooms (but you can change the code if needed
Code
properties_room <- properties_filtered[properties_filtered$number_of_rooms <20, ] # Filter only number_of_rooms less than 20# Create a histogram of prices for each number of roomshistogram <-ggplot(properties_room, aes(x = price)) +geom_histogram(binwidth =100000, fill ="skyblue", color ="black") +facet_wrap(~ number_of_rooms, scales ="free", ncol =2) +labs(title ="Distribution of Prices by Number of Rooms",x ="Price",y ="Frequency") +theme_minimal() +xlim(0, 5000000)# Convert ggplot object to plotly object with adjusted heightinteractive_histogram <-ggplotly(histogram) %>%layout(height =1000) # Adjust the height as needed# Display the interactive plotinteractive_histogram
5.7 Histogram of prices with impot
Click to show code
colnames(properties_filtered)[(ncol(properties_filtered) -3):ncol(properties_filtered)] <-gsub("\\s+", "_", colnames(properties_filtered)[(ncol(properties_filtered) -3):ncol(properties_filtered)])# Create a scatter plot to visualize correlation between price and Impôt cantonalscatter_plot <-ggplot(properties_filtered, aes(x = price, y = Impôt_cantonal_impots)) +geom_point() +labs(title ="Correlation between Price and Impôt cantonal",x ="Price",y ="Impôt cantonal") +theme_minimal()# Convert ggplot object to plotly objectinteractive_plot <-ggplotly(scatter_plot)# Display the interactive plotinteractive_plot